#Background #Talking about why I chose this topic #Why is this important? #What was my question(s)? #Talk about the data set(s) that I used
#Talk about the cleaning. I cleaned this dataset in a different file and imported the cleaned data #Doing a little more cleaning… #Change data types to factors
#Take a quick look at the data frame using skim
| Name | df |
| Number of rows | 1356 |
| Number of columns | 13 |
| _______________________ | |
| Column type frequency: | |
| factor | 3 |
| numeric | 10 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| gender | 0 | 1 | FALSE | 2 | Mal: 684, Fem: 672 |
| smoking_status | 0 | 1 | FALSE | 2 | No: 894, Yes: 462 |
| sleep_type | 0 | 1 | FALSE | 3 | Dee: 452, Lig: 452, REM: 452 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1.00 | 226.50 | 130.53 | 1.0 | 113.75 | 226.50 | 339.25 | 452.00 | ▇▇▇▇▇ |
| age | 0 | 1.00 | 40.29 | 13.16 | 9.0 | 29.00 | 40.00 | 52.00 | 69.00 | ▂▇▇▇▂ |
| sleep_duration | 0 | 1.00 | 7.47 | 0.87 | 5.0 | 7.00 | 7.50 | 8.00 | 10.00 | ▂▆▇▂▁ |
| sleep_efficiency | 0 | 1.00 | 0.79 | 0.14 | 0.5 | 0.70 | 0.82 | 0.90 | 0.99 | ▃▃▅▇▇ |
| awakenings | 60 | 0.96 | 1.64 | 1.36 | 0.0 | 1.00 | 1.00 | 3.00 | 4.00 | ▅▇▃▃▃ |
| caffeine_consumption | 75 | 0.94 | 23.65 | 30.18 | 0.0 | 0.00 | 25.00 | 50.00 | 200.00 | ▇▃▁▁▁ |
| alcohol_consumption | 42 | 0.97 | 1.17 | 1.62 | 0.0 | 0.00 | 0.00 | 2.00 | 5.00 | ▇▁▁▁▁ |
| exercise_frequency | 18 | 0.99 | 1.79 | 1.43 | 0.0 | 0.00 | 2.00 | 3.00 | 5.00 | ▇▂▅▂▁ |
| sleep_type_percentage | 0 | 1.00 | 33.33 | 18.83 | 7.0 | 19.00 | 24.00 | 55.00 | 75.00 | ▇▆▁▅▂ |
| bedtime | 0 | 1.00 | 2356.48 | 166.90 | 2100.0 | 2200.00 | 2400.00 | 2500.00 | 2630.00 | ▇▅▅▅▆ |
#I was interested to see if bedtime would affect sleep efficiency (what that is…)
# Plot of bedtime effect on sleep efficiency
# Turn a bedtime code into a 12-hour clock label, e.g. 2130 -> "9:30 p.m.".
# NOTE(review): assumes bedtime is coded as HHMM with times after midnight
# continuing past 2400 (2630 = 2:30 a.m.); this is consistent with the
# 2100-2630 range in the skim summary and with the hand-written labels
# this helper replaces -- confirm against the cleaning script.
format_bedtime <- function(code) {
  code <- as.numeric(as.character(code))
  hour24 <- (code %/% 100) %% 24
  minute <- as.integer(code %% 100)
  hour12 <- as.integer(ifelse(hour24 %% 12 == 0, 12, hour24 %% 12))
  suffix <- ifelse(hour24 < 12, "a.m.", "p.m.")
  sprintf("%d:%02d %s", hour12, minute, suffix)
}

# Scatterplot of sleep efficiency at each observed bedtime.
# The axis labels are computed from the factor levels themselves, so a
# bedtime that is added to (or missing from) the data can never shift the
# labels onto the wrong tick, which a positional label vector would do.
bed_plot1 <- df %>%
  ggplot(aes(x = factor(bedtime), y = sleep_efficiency,
             color = factor(bedtime))) +
  geom_point() +
  scale_x_discrete(labels = format_bedtime) +
  labs(x = "Bedtime", y = "Sleep Efficiency Proportion",
       title = "Scatterplot of Bedtime's Effect on Sleep Efficiency") +
  theme_minimal()

# Interactive plot
ggplotly(bed_plot1)
#This didn’t show what I thought it would. It doesn’t appear that sleep efficiency is affected much by bedtime #Let’s create some models to see what does affect sleep efficiency! # I want to see the effects of REM sleep from the sleep_type_percentage column
# Keep only the REM rows, so sleep_type_percentage is the REM percentage.
rem_df <- filter(df, sleep_type == "REM")

# glm() with its default (gaussian) family: sleep efficiency regressed on
# every available predictor.
mod1REM <- glm(
  formula = sleep_efficiency ~ age + gender +
    sleep_duration + awakenings + caffeine_consumption +
    alcohol_consumption + smoking_status + exercise_frequency +
    sleep_type_percentage + bedtime,
  data = rem_df
)

# Coefficient table, rendered with a hover-highlighted classic style.
kableExtra::kable_classic(
  kableExtra::kable(tidy(mod1REM)),
  lightable_options = "hover"
)
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 0.8653406 | 0.0911912 | 9.4892980 | 0.0000000 |
| age | 0.0013841 | 0.0003856 | 3.5899924 | 0.0003744 |
| genderMale | 0.0082960 | 0.0108749 | 0.7628562 | 0.4460262 |
| sleep_duration | -0.0023318 | 0.0055513 | -0.4200353 | 0.6746989 |
| awakenings | -0.0475231 | 0.0038185 | -12.4454136 | 0.0000000 |
| caffeine_consumption | 0.0001605 | 0.0001775 | 0.9044667 | 0.3663256 |
| alcohol_consumption | -0.0237291 | 0.0031003 | -7.6537166 | 0.0000000 |
| smoking_statusYes | -0.0781570 | 0.0106176 | -7.3610714 | 0.0000000 |
| exercise_frequency | 0.0117639 | 0.0037580 | 3.1303375 | 0.0018822 |
| sleep_type_percentage | 0.0016225 | 0.0014384 | 1.1279679 | 0.2600508 |
| bedtime | -0.0000210 | 0.0000322 | -0.6519992 | 0.5147990 |
#The REM sleep percentage is not a significant predictor of sleep_efficiency (p = 0.26), so we will not use that one
#Instead I tried Deep Sleep
# Keep only the Deep rows, so sleep_type_percentage is the deep-sleep
# percentage. Time in deep sleep is significant; REM is not.
deep_df <- filter(df, sleep_type == "Deep")

# Linear model of sleep efficiency on every available predictor.
mod1 <- lm(
  formula = sleep_efficiency ~ age + gender +
    sleep_duration + awakenings + caffeine_consumption +
    alcohol_consumption + smoking_status + exercise_frequency +
    sleep_type_percentage + bedtime,
  data = deep_df
)

# Coefficient table, rendered with a hover-highlighted classic style.
mod1_table <- kableExtra::kable(tidy(mod1))
kableExtra::kable_classic(mod1_table, lightable_options = "hover")
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 0.5812434 | 0.0582704 | 9.9749417 | 0.0000000 |
| age | 0.0011359 | 0.0002612 | 4.3491404 | 0.0000176 |
| genderMale | -0.0033367 | 0.0073479 | -0.4540953 | 0.6500216 |
| sleep_duration | 0.0017794 | 0.0037713 | 0.4718332 | 0.6373188 |
| awakenings | -0.0323604 | 0.0026860 | -12.0479598 | 0.0000000 |
| caffeine_consumption | 0.0003140 | 0.0001200 | 2.6161485 | 0.0092502 |
| alcohol_consumption | -0.0078486 | 0.0022348 | -3.5120300 | 0.0004986 |
| smoking_statusYes | -0.0436934 | 0.0073560 | -5.9398190 | 0.0000000 |
| exercise_frequency | 0.0069996 | 0.0025578 | 2.7365283 | 0.0065029 |
| sleep_type_percentage | 0.0051885 | 0.0002460 | 21.0909567 | 0.0000000 |
| bedtime | -0.0000284 | 0.0000218 | -1.3066559 | 0.1921261 |
#Deep Sleep is significant #All others that are ***: Age, Awakenings, Alcohol Consumption, Smoking_Status(Yes) #**: Caffeine_consumption, exercise_frequency #Things that surprised me: Gender not making a difference, Bedtime of course, Sleep duration (I would have thought that the longer you slept the more quality of sleep you’d have)
#Plot ** and *** #age
# Sleep efficiency against age, with the default smoother drawn without
# its confidence band.
p1 <- ggplot(deep_df, aes(x = age, y = sleep_efficiency)) +
  theme_minimal() +
  geom_point(color = "steelblue") +
  geom_smooth(color = "darkgreen", se = FALSE)

# Interactive version
ggplotly(p1)
# Another look: age vs. sleep efficiency with one linear trend per
# awakenings count.
# Rows with a missing awakenings value are dropped first (as in the other
# awakenings plots) so they do not show up as an extra "NA" colour group
# with its own fitted line.
deep_df %>%
  filter(!is.na(awakenings)) %>%
  ggplot(aes(x = age, y = sleep_efficiency, color = factor(awakenings))) +
  geom_point() +
  geom_smooth(se = FALSE, method = "lm")
#awakenings
# Violin plot of sleep efficiency for each awakenings count (missing
# counts dropped). The fill legend only repeats the x-axis categories,
# so it is switched off; this must come after theme_minimal(), which
# would otherwise reset it.
p2 <- deep_df %>%
  filter(!is.na(awakenings)) %>%
  ggplot(aes(x = factor(awakenings), y = sleep_efficiency,
             fill = factor(awakenings))) +
  geom_violin() +
  theme_minimal() +
  scale_fill_brewer(palette = "Dark2") +
  theme(legend.position = "none")
ggplotly(p2)

# Same comparison as box plots. Uses deep_df (one row per participant)
# rather than df, which holds every participant three times -- once per
# sleep type -- and would triple-count each observation.
# TODO: maybe better as a color or facet.
deep_df %>%
  filter(!is.na(awakenings)) %>%
  ggplot(aes(x = factor(awakenings), y = sleep_efficiency,
             fill = factor(awakenings))) +
  geom_boxplot()
#alcohol
# Violin plot of sleep efficiency for each alcohol consumption level
# (missing values dropped). The fill legend only repeats the x-axis
# categories, so it is switched off; this must come after
# theme_minimal(), which would otherwise reset it.
p3 <- deep_df %>%
  filter(!is.na(alcohol_consumption)) %>%
  ggplot(aes(x = factor(alcohol_consumption), y = sleep_efficiency,
             fill = factor(alcohol_consumption))) +
  geom_violin() +
  theme_minimal() +
  scale_fill_brewer(palette = "Dark2") +
  theme(legend.position = "none")
ggplotly(p3)

# Same comparison as box plots.
deep_df %>%
  filter(!is.na(alcohol_consumption)) %>%
  ggplot(aes(x = factor(alcohol_consumption), y = sleep_efficiency,
             fill = factor(alcohol_consumption))) +
  geom_boxplot()
#smoking
# Violin plot of sleep efficiency for smokers vs. non-smokers.
# No NA filter is needed: the skim summary shows smoking_status has no
# missing values. The fill legend only repeats the x-axis categories, so
# it is switched off (after theme_minimal(), which would otherwise reset
# it).
p4 <- deep_df %>%
  ggplot(aes(x = factor(smoking_status), y = sleep_efficiency,
             fill = factor(smoking_status))) +
  geom_violin() +
  theme_minimal() +
  scale_fill_brewer(palette = "Dark2") +
  theme(legend.position = "none")
ggplotly(p4)
#exercise
# Violin plot of sleep efficiency for each exercise frequency (missing
# values dropped). The fill legend only repeats the x-axis categories,
# so it is switched off; this must come after theme_minimal(), which
# would otherwise reset it.
p5 <- deep_df %>%
  filter(!is.na(exercise_frequency)) %>%
  ggplot(aes(x = factor(exercise_frequency), y = sleep_efficiency,
             fill = factor(exercise_frequency))) +
  geom_violin() +
  theme_minimal() +
  scale_fill_brewer(palette = "Paired") +
  theme(legend.position = "none")
ggplotly(p5)

# Same comparison as box plots. Uses deep_df (one row per participant)
# rather than df, which holds every participant three times -- once per
# sleep type -- and would triple-count each observation.
deep_df %>%
  filter(!is.na(exercise_frequency)) %>%
  ggplot(aes(x = factor(exercise_frequency), y = sleep_efficiency,
             fill = factor(exercise_frequency))) +
  geom_boxplot()
#deep sleep
# Deep-sleep percentage against sleep efficiency, with the default
# smoother drawn without its confidence band.
ggplot(
  deep_df,
  aes(x = sleep_type_percentage, y = sleep_efficiency)
) +
  geom_point() +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Deep-sleep percentage vs. sleep efficiency, one panel and one linear
# fit per awakenings count (rows with missing awakenings dropped).
# Free scales per panel make the 0-awakenings panel look kind of funny.
ggplot(
  filter(deep_df, !is.na(awakenings)),
  aes(x = sleep_type_percentage, y = sleep_efficiency,
      color = factor(awakenings))
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~factor(awakenings), scales = "free") +
  scale_color_brewer(palette = "Set2") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
#caffeine
# Box plots of sleep efficiency for each caffeine consumption level,
# with missing caffeine values dropped.
# TODO: probably better as a color or facet.
ggplot(
  filter(deep_df, !is.na(caffeine_consumption)),
  aes(x = factor(caffeine_consumption), y = sleep_efficiency,
      fill = factor(caffeine_consumption))
) +
  geom_boxplot()
#awakenings,alcohol,smoking
# Linear trend of sleep efficiency over awakenings, one line per alcohol
# consumption level, in separate panels for smokers and non-smokers.
# Uses deep_df (one row per participant) rather than df, which holds
# every participant three times -- once per sleep type.
# Rows missing either plotted variable are dropped up front instead of
# being removed silently by geom_smooth().
# The colour legend is kept: it is the only key to the alcohol lines.
deep_df %>%
  filter(!is.na(alcohol_consumption), !is.na(awakenings)) %>%
  ggplot(aes(x = awakenings, y = sleep_efficiency,
             color = factor(alcohol_consumption))) +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~smoking_status, scales = "free") +
  scale_color_brewer(palette = "Dark2") +
  theme_minimal()
#More modeling
# Candidate models for sleep efficiency in the deep-sleep data.
#
# lm() drops rows with a missing value in any variable it uses, so
# fitting straight from deep_df gives each model a different set of rows
# (awakenings and alcohol_consumption contain NAs). AIC/BIC and the
# other indices are only comparable between models fit to identical
# data, so all four are fit on the rows that are complete for every
# variable used by any of them.
model_vars <- c(
  "sleep_efficiency", "bedtime", "age", "awakenings",
  "alcohol_consumption", "smoking_status", "sleep_type_percentage"
)
model_df <- deep_df[complete.cases(deep_df[, model_vars]), ]

# NOTE: this reuses the name mod1, replacing the full model fitted
# earlier in the file.
# Bedtime only (baseline).
mod1 <- lm(data = model_df, formula = sleep_efficiency ~ bedtime)

# Additive effects of the three strongest predictors.
mod2 <- lm(data = model_df, formula = sleep_efficiency ~
             age + awakenings + sleep_type_percentage)

# Same three predictors with all two- and three-way interactions.
mod3 <- lm(data = model_df, formula = sleep_efficiency ~
             age * awakenings * sleep_type_percentage)

# Additive model extended with alcohol consumption and smoking status.
mod4 <- lm(data = model_df, formula = sleep_efficiency ~
             age + awakenings + alcohol_consumption + smoking_status +
             sleep_type_percentage)
#Find the mean squared error (MSE) of each model's residuals Mod1
# Mean of the squared residuals of mod1 (in-sample mean squared error).
mean(mod1[["residuals"]]^2)
## [1] 0.01786273
Mod2
# Mean of the squared residuals of mod2 (in-sample mean squared error).
mean(mod2[["residuals"]]^2)
## [1] 0.004803478
Mod3
# Mean of the squared residuals of mod3 (in-sample mean squared error).
mean(mod3[["residuals"]]^2)
## [1] 0.004494944
Mod4
# Mean of the squared residuals of mod4 (in-sample mean squared error).
mean(mod4[["residuals"]]^2)
## [1] 0.004279514
Which is the best?
# Table of fit indices for the four candidate models, side by side.
compare_performance(
  mod1,
  mod2,
  mod3,
  mod4
)
## When comparing models, please note that probably not all models were fit
## from same data.
## # Comparison of Model Performance Indices
##
## Name | Model | AIC (weights) | AICc (weights) | BIC (weights) | R2 | R2 (adj.) | RMSE | Sigma
## ------------------------------------------------------------------------------------------------------
## mod1 | lm | -530.6 (<.001) | -530.5 (<.001) | -518.3 (<.001) | 0.021 | 0.019 | 0.134 | 0.134
## mod2 | lm | -1070.2 (<.001) | -1070.1 (<.001) | -1049.9 (0.083) | 0.739 | 0.737 | 0.069 | 0.070
## mod3 | lm | -1090.9 (0.997) | -1090.5 (0.996) | -1054.3 (0.752) | 0.756 | 0.752 | 0.067 | 0.068
## mod4 | lm | -1079.5 (0.003) | -1079.2 (0.004) | -1051.3 (0.165) | 0.768 | 0.765 | 0.065 | 0.066
# Plot the same model comparison.
plot(compare_performance(mod1, mod2, mod3, mod4))
## When comparing models, please note that probably not all models were fit
## from same data.
#Mod3 appears to be the best: it has the highest AIC, AICc and BIC weights (mod4 has a slightly higher R2 and lower RMSE)
#Add predictions
# Attach mod3's fitted values to the real deep-sleep rows as column `pred`.
df2 <- add_predictions(deep_df, mod3)

# Make some hypothetical values for the independent variables in the model.
# NOTE(review): several of these lie outside the ranges in the skim
# summary (age 9-69, awakenings 0-4, sleep_type_percentage 7-75), so
# their predictions are extrapolations beyond the observed data.
newdf <- data.frame(
  age = c(70, 8, 5, 80),
  awakenings = c(5, 1, 6, 3),
  sleep_type_percentage = c(49, 18, 45, 77)
)

# Make predictions
pred <- predict(mod3, newdata = newdf)

# Combine hypothetical input data with hypothetical predictions into one
# new data frame
hyp_preds <- data.frame(
  age = newdf$age,
  awakenings = newdf$awakenings,
  sleep_type_percentage = newdf$sleep_type_percentage,
  pred = pred
)

# Add new column showing whether a data point is real or hypothetical
df2$prediction_type <- "Real"
hyp_preds$prediction_type <- "Hypothetical"

# Stack real and hypothetical rows. bind_rows() states the intent
# directly: the previous full_join() had no `by`, so it matched on every
# shared column (including prediction_type, which never matches) and
# only behaved as a row-append while printing a "Joining with" message.
# Columns missing from hyp_preds are filled with NA.
fullpreds <- bind_rows(df2, hyp_preds)

# Observed efficiency in black underneath; model predictions coloured by
# real vs. hypothetical, each with a linear trend.
# (`width` is not a geom_point() parameter and was ignored with a
# warning, so it has been removed.)
ggplot(fullpreds, aes(x = sleep_type_percentage, y = pred,
                      color = prediction_type)) +
  geom_point(aes(y = sleep_efficiency), color = "Black") +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
References
National Sleep Foundation. (2024). Why do we need sleep? Sleep Foundation. Retrieved from https://www.sleepfoundation.org/how-sleep-works/why-do-we-need-sleep